home *** CD-ROM | disk | FTP | other *** search
- unit JimParse;
-
- interface
-
- uses
- SysUtils, Classes;
-
-
- type
- TjimToken = class(TObject)
- private
- FTokenType : Integer;
- FAsString : string;
- public
- property TokenType : Integer read FTokenType write FTokenType;
- property AsString : string read FAsString write FAsString;
- end;
-
-
- TjimLexicalAnalyser = class(TObject)
- private
- FText : string;
- FPosition : Integer;
-
- procedure SetText(const Value : string);
- public
- constructor Create;
-
- procedure GetNextToken(NextToken : TjimToken);
- property Text : string read FText write SetText;
- end;
-
-
- TjimSymbolType = (stTitle,stBase,stLink,stImage);
-
-
- TjimSymbol = class(TCollectionItem)
- private
- FSymbolType : TjimSymbolType;
- FSymbolValue : string;
- public
- procedure Assign(Source : TPersistent); override;
-
- property SymbolType : TjimSymbolType read FSymbolType write FSymbolType;
- property SymbolValue : string read FSymbolValue write FSymbolValue;
- end;
-
-
- TjimSymbolTable = class(TCollection)
- private
- function GetItem(Index : Integer) : TjimSymbol;
- procedure SetItem(Index : Integer;Value : TjimSymbol);
- public
- function Add : TjimSymbol;
- function AddSymbol(SymType : TjimSymbolType;SymValue : string) : TjimSymbol;
-
- property Items[Index : Integer] : TjimSymbol read GetItem write SetItem; default;
- end;
-
-
- TjimHtmlParser = class(TObject)
- private
- FLookahead : TjimToken;
- FLexAnalyser : TjimLexicalAnalyser;
- FSymbolTable : TjimSymbolTable;
- FLastTag : string;
-
- procedure Match(T : Integer);
- procedure ConsumeWhiteSpace;
- procedure Document;
- procedure Tag;
- procedure Data;
- procedure TagName;
- procedure AttributeList;
- function AttributeName : string;
- function Value : string;
- function Identifier : string;
- function QuotedValue : string;
- function PlainValue : string;
- public
- constructor Create;
- destructor Destroy; override;
-
- procedure Parse(const DocString : string);
-
- property SymbolTable : TjimSymbolTable read FSymbolTable;
- end;
-
-
- EjimHtmlParserError = class(Exception);
-
-
- implementation
-
- const
- // Token types are the characters 0 to 255, along with the following
- ttEndOfDoc = -1;
-
-
-
- // --------------------------- TjimLexicalAnalyser ---------------------------
-
- constructor TjimLexicalAnalyser.Create;
- begin {Create}
- FText := '';
- FPosition := 0;
- end; {Create}
-
-
- procedure TjimLexicalAnalyser.SetText(const Value : string);
- begin {SetText}
- if FText = Value then begin
- // Only proceed if setting a new string
- Exit;
- end;
-
- FPosition := 1;
- FText := Value;
- end; {SetText}
-
-
- procedure TjimLexicalAnalyser.GetNextToken(NextToken : TjimToken);
- begin {GetNextToken}
-
- // Read the next character
- if FPosition > Length(FText) then begin
- // At the end of the document
- NextToken.AsString := #0;
- NextToken.TokenType := ttEndOfDoc;
- end else begin
- // Return the character
- NextToken.AsString := FText[FPosition];
- NextToken.TokenType := Integer(FText[FPosition]);
- Inc(FPosition);
- end;
- end; {GetNextToken}
-
-
- // ------------------------------- TjimSymbol -------------------------------
-
- procedure TjimSymbol.Assign(Source : TPersistent);
- begin {Assign}
- if Source is TjimSymbol then begin
- SymbolType := TjimSymbol(Source).SymbolType;
- SymbolValue := TjimSymbol(Source).SymbolValue;
- Exit;
- end;
-
- inherited Assign(Source);
- end; {Assign}
-
-
- // ------------------------------ TjimSymbolTable ----------------------------
-
- function TjimSymbolTable.GetItem(Index : Integer) : TjimSymbol;
- begin {GetItem}
- Result := TjimSymbol(inherited GetItem(Index));
- end; {GetItem}
-
-
- procedure TjimSymbolTable.SetItem(Index : Integer;Value : TjimSymbol);
- begin {SetItem}
- inherited SetItem(Index,Value);
- end; {SetItem}
-
-
- function TjimSymbolTable.Add : TjimSymbol;
- begin {Add}
- Result := TjimSymbol(inherited Add);
- end; {Add}
-
-
- function TjimSymbolTable.AddSymbol(SymType : TjimSymbolType;SymValue : string) : TjimSymbol;
- var
- i : Integer;
- begin {AddSymbol}
- Result := nil;
-
- // Check whether symbol is already in the list
- for i := 0 to Count - 1 do begin
- if (Items[i].SymbolType = SymType) and (Items[i].SymbolValue = SymValue) then begin
- Exit;
- end;
- end;
-
- Result := Add;
- Result.SymbolType := SymType;
- result.SymbolValue := SymValue;
- end; {AddSymbol}
-
-
- // ------------------------------ TjimHtmlParser -----------------------------
-
- procedure TjimHtmlParser.ConsumeWhiteSpace;
- // Eats 'whitespace' ie chars 0 to 32 inclusive. Here instead of lexical
- // analyser because white space is allowed sometimes.
- begin {ConsumeWhiteSpace}
- while (FLookahead.TokenType <> ttEndOfDoc) and
- (FLookAhead.AsString <= ' ') do begin
- FLexAnalyser.GetNextToken(FLookAhead);
- end;
- end; {ConsumeWhiteSpace}
-
-
- procedure TjimHtmlParser.Match(T : Integer);
- // If the token type T matches the FLookahead token type then FLookAhead is
- // set to the next token, otherwise an exception is raised
- begin {Match}
- if FLookahead.TokenType = T then begin
- FLexAnalyser.GetNextToken(FLookahead);
- end else begin
- raise EjimHtmlParserError.Create('HTML syntax error. Expected ' +
- IntToStr(FLookahead.TokenType));
- end;
- end; {Match}
-
-
- procedure TjimHtmlParser.Document;
- begin {Document}
- while FLookahead.TokenType <> ttEndOfDoc do begin
- ConsumeWhiteSpace;
-
- if FLookahead.AsString = '<' then begin
- Tag;
- end else begin
- Data;
- end;
- end;
-
- Match(ttEndOfDoc);
- end; {Document}
-
-
- procedure TjimHtmlParser.Tag;
- begin {Tag}
- Match(Ord('<'));
- ConsumeWhiteSpace;
-
- if FLookahead.AsString = '/' then begin
- // Finding an end tag
- Match(Ord('/'));
- FLastTag := '/';
- ConsumeWhiteSpace;
- TagName;
- end else begin
- // Finding a start tag, or a tag that doesn't enclose anything
- FLastTag := '';
- ConsumeWhiteSpace;
- TagName;
- ConsumeWhiteSpace;
- AttributeList;
- end;
-
- Match(Ord('>'));
- end; {Tag}
-
-
- procedure TjimHtmlParser.Data;
- var
- TitleStr : string;
- begin {Data}
- TitleStr := '';
-
- while (FLookahead.AsString <> '<') and
- (FLookahead.TokenType <> ttEndOfDoc) do begin
- // Collect the title string. It is ok to search like this because no other
- // tags are allowed in a title
- if CompareText(FLastTag,'Title') = 0 then begin
- TitleStr := TitleStr + FLookahead.AsString;
- end;
-
- Match(FLookahead.TokenType);
- end;
-
- if TitleStr > '' then begin
- FSymbolTable.AddSymbol(stTitle,TitleStr);
- end;
- end; {Data}
-
-
- procedure TjimHtmlParser.TagName;
- begin {TagName}
- FLastTag := FLastTag + Identifier;
-
- if FLastTag = '!--' then begin
- // In a comment tag. Treat this specially by ignoring all characters
- // until the end of the comment tag
- repeat
- if FLookahead.AsString = '-' then begin
- FLastTag := FLastTag + FLookahead.AsString;
- end else begin
- FLastTag := '';
- end;
-
- Match(FLookahead.TokenType);
- until FLastTag = '--';
- end else if CompareText(FLastTag,'META') = 0 then begin
- // In a META tag. There is all sorts of rubbish here, so consume it all
- // until the end of the tag
- while FLookahead.AsString <> '>' do begin
- Match(FLookahead.TokenType);
- end;
- end;
- end; {TagName}
-
-
- procedure TjimHtmlParser.AttributeList;
- var
- FLastAttribute : string;
- FLastValue : string;
- begin {AttributeList}
- while FLookahead.AsString <> '>' do begin
- FLastAttribute := AttributeName;
- ConsumeWhiteSpace;
-
- if FLookahead.AsString = '=' then begin
- Match(Ord('='));
- ConsumeWhiteSpace;
- FLastValue := Value;
- ConsumeWhiteSpace;
-
- // Should only get here if FLastAttribute is not an empty string
- if (CompareText(FLastTag,'BASE') = 0) and
- (CompareText('HREF',FLastAttribute) = 0) then begin
- // Special case when found the HREF attribute of a BASE tag
- FSymbolTable.AddSymbol(stBase,FLastValue);
- end else if (CompareText(FLastTag,'IMG') = 0) and
- (CompareText('SRC',FLastAttribute) = 0) then begin
- // Found an image
- FSymbolTable.AddSymbol(stImage,FLastValue);
- end else if ((CompareText(FLastTag,'A') = 0) or
- (CompareText(FLastTag,'AREA') = 0) or
- (CompareText(FLastTag,'LINK') = 0)) and
- (CompareText('HREF',FLastAttribute) = 0) then begin
- // Found an ordinary link
- FSymbolTable.AddSymbol(stLink,FLastValue);
- end else if (CompareText(FLastTag,'FRAME') = 0) and
- (CompareText('SRC',FLastAttribute) = 0) then begin
- // Found an ordinary link
- FSymbolTable.AddSymbol(stLink,FLastValue);
- end;
- end;
- end;
- end; {AttributeList}
-
-
- function TjimHtmlParser.AttributeName : string;
- begin {AttributeName}
- Result := '';
-
- if FLookahead.AsString = '"' then begin
- Result := QuotedValue;
- end else begin
- Result := Identifier;
- end;
- end; {AttributeName}
-
-
- function TjimHtmlParser.Value : string;
- begin {Value}
- Result := '';
-
- if FLookahead.AsString = '"' then begin
- Result := QuotedValue;
- end else begin
- Result := PlainValue;
- end;
- end; {Value}
-
-
- function TjimHtmlParser.Identifier : string;
- const
- IdentifierSet = ['A'..'Z','a'..'z','0'..'9','-','!'];
- begin {Identifier}
- Result := '';
-
- if (Length(FLookahead.AsString) >= 1) and
- (not (FLookahead.AsString[1] in IdentifierSet)) then begin
- raise EjimHtmlParserError.Create('HTML syntax error. Expected identifier, ' +
- 'but got : ' + FLookahead.AsString +
- ' in tag ' + FLastTag);
- end;
-
- repeat
- Result := Result + FLookahead.AsString;
-
- if Result = '!--' then begin
- // Found a comment tag. Some people eg Microsoft, don't put a space after
- // this part of the tag
- Exit;
- end;
-
- Match(FLookahead.TokenType);
- until not (FLookahead.AsString[1] in IdentifierSet);
- end; {Identifier}
-
-
- function TjimHtmlParser.QuotedValue : string;
- begin {QuotedValue}
- Result := '';
- Match(Ord('"'));
-
- while FLookahead.AsString <> '"' do begin
- Result := Result + FLookahead.AsString;
- Match(FLookahead.TokenType);
- end;
-
- Match(Ord('"'));
- end; {QuotedValue}
-
-
- function TjimHtmlParser.PlainValue : string;
- const
- PlainValueSet = ['A'..'Z','a'..'z','0'..'9','-','.','+','-',':','/','?',
- ''''];
- begin {PlainValue}
- Result := '';
-
- if (Length(FLookahead.AsString) >= 1) and
- (not (FLookahead.AsString[1] in PlainValueSet)) then begin
- raise EjimHtmlParserError.Create('HTML syntax error. Expected plain value, ' +
- 'but got : ' + FLookahead.AsString +
- ' in tag ' + FLastTag);
- end;
-
- repeat
- Result := Result + FLookahead.AsString;
- Match(FLookahead.TokenType);
- until not (FLookahead.AsString[1] in PlainValueSet);
- end; {PlainValue}
-
-
- constructor TjimHtmlParser.Create;
- begin {Create}
- FLookahead := TjimToken.Create;
- FLexAnalyser := TjimLexicalAnalyser.Create;
- FSymbolTable := TjimSymbolTable.Create(TjimSymbol);
- end; {Create}
-
-
- destructor TjimHtmlParser.Destroy;
- begin {Destroy}
- FLookahead.Free;
- FLexAnalyser.Free;
- FSymbolTable.Free;
- end; {Destroy}
-
-
- procedure TjimHtmlParser.Parse(const DocString : string);
- begin {Parse}
- if DocString = '' then begin
- Exit;
- end;
-
- FLastTag := '';
- FLexAnalyser.Text := DocString;
- FLexAnalyser.GetNextToken(FLookahead);
- Document;
- end; {Parse}
-
-
- end.
-